Import Libraries¶
In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency ,ttest_ind, f_oneway, shapiro, pointbiserialr, zscore, kendalltau, spearmanr, mannwhitneyu, kruskal,wilcoxon
from statsmodels.graphics.gofplots import qqplot
import itertools
import math
#to display all the rows/columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
Import dataset Full data set¶
In [ ]:
file_path = '/content/full_dataset_with_prediction_a.csv'
df_full = pd.read_csv(file_path)
df_full.head()
Out[Â ]:
| Age | Gender | Uncomplicated Hypertension | Complicated Hypertension | Uncomplicated Diabetes | Complicated Diabetes | Malignancy | Hematologic Disease | Metastasis | Peripheral Vascular Disease | Hypothyroidism | Chronic Heart Failure | Stroke | Liver Disease | SAPS II | SOFA | OASIS | Sepsis | Any Organ Failure | Severe Respiratory Failure | Severe Coagulation Failure | Severe Liver Failure | Severe Cardiovascular Failure | Severe Central Nervous System Failure | Severe Renal Failure | Respiratory Dysfunction | Cardiovascular Dysfunction | Renal Dysfunction | Hematologic Dysfunction | Metabolic Dysfunction | Neurologic Dysfunction | Max Heart Rate | Min Heart Rate | Mean Heart Rate | Max MAP | Min MAP | Mean MAP | Max Systolic Pressure | Min Systolic Pressure | Mean Systolic Pressure | Max Diastolic Pressure | Min Diastolic Pressure | Mean Diastolic Pressure | Max Temperature | Min Temperature | Mean Temperature | Max pH | Min pH | Mean pH | Max Glucose | Min Glucose | Mean Glucose | Max WBC | Min WBC | Mean WBC | Max BUN | Min BUN | Mean BUN | Max Creatinine | Min Creatinine | Mean Creatinine | Max Hemoglobin | Min Hemoglobin | Mean Hemoglobin | Hospital Mortality | Predicted | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 65 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 67 | 12 | 41 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 87.0 | 60.0 | 71.466667 | 89.0 | 57.000000 | 70.200000 | 147.0 | 83.0 | 108.548387 | 65.0 | 43.0 | 54.548387 | 38.200001 | 36.099998 | 37.310714 | 7.42 | 7.29 | 7.35 | 175.0 | 76.0 | 124.94 | 21.7 | 14.6 | 18.05 | 23.0 | 18.0 | 20.67 | 1.5 | 1.1 | 1.33 | 11.2 | 9.3 | 10.08 | 0 | 0 |
| 1 | 24 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 19 | 1 | 35 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 129.0 | 69.0 | 85.459459 | 143.0 | 69.000000 | 89.085714 | 195.0 | 120.0 | 141.500000 | 127.0 | 55.0 | 72.705882 | 36.222221 | 35.166668 | 35.870370 | 7.57 | 7.26 | 7.41 | 204.0 | 134.0 | 169.00 | 6.3 | 6.3 | 6.30 | 13.0 | 13.0 | 13.00 | 0.7 | 0.6 | 0.65 | 12.6 | 12.6 | 12.60 | 0 | 1 |
| 2 | 76 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 42 | 4 | 30 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 80.0 | 53.0 | 73.258065 | 80.0 | 53.000000 | 66.781250 | 134.0 | 89.0 | 111.593750 | 66.0 | 38.0 | 48.625000 | 36.500000 | 35.444444 | 35.920635 | 7.46 | 7.29 | 7.38 | 173.0 | 107.0 | 149.33 | 12.9 | 9.3 | 10.87 | 28.0 | 27.0 | 27.50 | 1.0 | 1.0 | 1.00 | 12.0 | 10.1 | 10.60 | 0 | 0 |
| 3 | 53 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 29 | 9 | 19 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 99.0 | 53.0 | 74.454545 | 107.0 | 59.000000 | 73.607841 | 188.0 | 90.0 | 115.480769 | 104.0 | 42.0 | 54.153846 | 37.099998 | 36.700001 | 36.990322 | 7.49 | 7.27 | 7.39 | 240.0 | 72.0 | 157.46 | 16.6 | 9.9 | 14.43 | 36.0 | 23.0 | 29.60 | 1.3 | 1.1 | 1.22 | 11.1 | 7.1 | 10.01 | 0 | 0 |
| 4 | 36 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | 4 | 42 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 156.0 | 107.0 | 117.964286 | 201.0 | 72.333298 | 104.595225 | 162.0 | 91.0 | 135.655172 | 113.0 | 53.0 | 78.344828 | 38.000001 | 36.666667 | 37.333334 | 7.38 | 7.19 | 7.28 | 364.0 | 141.0 | 223.00 | 15.2 | 8.7 | 12.13 | 27.0 | 21.0 | 25.00 | 1.9 | 1.5 | 1.68 | 15.4 | 13.4 | 14.33 | 0 | 1 |
Change the Value of 'Gender' Variable¶
In [ ]:
value_map = {'M': 1, 'F': 0} # '0' for female and '1' for Male
df['Gender'] = df['Gender'].replace(value_map)
In [ ]:
df_t.head()
Out[Â ]:
| Age | Gender | Uncomplicated Hypertension | Complicated Hypertension | Uncomplicated Diabetes | Complicated Diabetes | Malignancy | Hematologic Disease | Metastasis | Peripheral Vascular Disease | Hypothyroidism | Chronic Heart Failure | Stroke | Liver Disease | SAPS II | SOFA | OASIS | Sepsis | Any Organ Failure | Severe Respiratory Failure | Severe Coagulation Failure | Severe Liver Failure | Severe Cardiovascular Failure | Severe Central Nervous System Failure | Severe Renal Failure | Respiratory Dysfunction | Cardiovascular Dysfunction | Renal Dysfunction | Hematologic Dysfunction | Metabolic Dysfunction | Neurologic Dysfunction | Max Heart Rate | Min Heart Rate | Mean Heart Rate | Max MAP | Min MAP | Mean MAP | Max Systolic Pressure | Min Systolic Pressure | Mean Systolic Pressure | Max Diastolic Pressure | Min Diastolic Pressure | Mean Diastolic Pressure | Max Temperature | Min Temperature | Mean Temperature | Max pH | Min pH | Mean pH | Max Glucose | Min Glucose | Mean Glucose | Max WBC | Min WBC | Mean WBC | Max BUN | Min BUN | Mean BUN | Max Creatinine | Min Creatinine | Mean Creatinine | Max Hemoglobin | Min Hemoglobin | Mean Hemoglobin | Hospital Mortality | Predicted | Actual | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 65 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 67 | 12 | 41 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 87.0 | 60.0 | 71.466667 | 89.0 | 57.0 | 70.200000 | 147.0 | 83.0 | 108.548387 | 65.0 | 43.0 | 54.548387 | 38.200001 | 36.099998 | 37.310714 | 7.42 | 7.29 | 7.35 | 175.0 | 76.0 | 124.94 | 21.7 | 14.6 | 18.05 | 23.0 | 18.0 | 20.67 | 1.5 | 1.1 | 1.33 | 11.2 | 9.3 | 10.08 | 0 | 0 | 0 |
| 1 | 76 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 42 | 4 | 30 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 80.0 | 53.0 | 73.258065 | 80.0 | 53.0 | 66.781250 | 134.0 | 89.0 | 111.593750 | 66.0 | 38.0 | 48.625000 | 36.500000 | 35.444444 | 35.920635 | 7.46 | 7.29 | 7.38 | 173.0 | 107.0 | 149.33 | 12.9 | 9.3 | 10.87 | 28.0 | 27.0 | 27.50 | 1.0 | 1.0 | 1.00 | 12.0 | 10.1 | 10.60 | 0 | 0 | 0 |
| 2 | 53 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 29 | 9 | 19 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 99.0 | 53.0 | 74.454545 | 107.0 | 59.0 | 73.607841 | 188.0 | 90.0 | 115.480769 | 104.0 | 42.0 | 54.153846 | 37.099998 | 36.700001 | 36.990322 | 7.49 | 7.27 | 7.39 | 240.0 | 72.0 | 157.46 | 16.6 | 9.9 | 14.43 | 36.0 | 23.0 | 29.60 | 1.3 | 1.1 | 1.22 | 11.1 | 7.1 | 10.01 | 0 | 0 | 0 |
| 3 | 42 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 24 | 4 | 23 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 102.0 | 72.0 | 90.185185 | 85.0 | 58.0 | 70.148148 | 128.0 | 82.0 | 106.222222 | 70.0 | 47.0 | 56.907407 | 38.099998 | 35.799999 | 37.257408 | 7.46 | 7.29 | 7.39 | 228.0 | 75.0 | 130.88 | 23.1 | 9.3 | 16.20 | 13.0 | 13.0 | 13.00 | 0.8 | 0.8 | 0.80 | 16.1 | 7.6 | 10.56 | 0 | 0 | 0 |
| 4 | 77 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 39 | 5 | 31 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 94.0 | 71.0 | 80.517241 | 85.0 | 56.0 | 71.068966 | 136.0 | 86.0 | 116.000000 | 65.0 | 41.0 | 52.724138 | 38.200000 | 36.400000 | 37.168966 | 7.44 | 7.29 | 7.36 | 189.0 | 92.0 | 132.36 | 14.6 | 9.8 | 11.90 | 20.0 | 18.0 | 19.00 | 1.0 | 1.0 | 1.00 | 14.1 | 10.4 | 11.93 | 0 | 0 | 0 |
Divide dataset into 6 different datasets of true and fasle values for comparison¶
In [ ]:
df_true_pos = df_full[(df_full['Predicted'] == 1) & (df_full['Hospital Mortality'] == 1)] # to extract true positive observations
df_false_pos = df_full[(df_full['Predicted'] == 1) & (df_full['Hospital Mortality'] == 0)] # to extract false positive observations
df_true_neg = df_full[(df_full['Predicted'] == 0) & (df_full['Hospital Mortality'] == 0)] # to extract true negative observations
df_false_neg = df_full[(df_full['Predicted'] == 0) & (df_full['Hospital Mortality'] == 1)] # to extract false negatibe observations
df_true_values = df_full[((df_full['Predicted'] == 1) & (df_full['Hospital Mortality'] == 1)) | ((df_full['Predicted'] == 0) & (df_full['Hospital Mortality'] == 0))] # to extract true positive and true negative observations
df_false_values = df_full[((df_full['Predicted'] == 1) & (df_full['Hospital Mortality'] == 0)) | ((df_full['Predicted'] == 0) & (df_full['Hospital Mortality'] == 1))] # to extract false positive and false negative observations
Dimentions¶
In [ ]:
df_true_pos.shape
Out[Â ]:
(606, 66)
In [ ]:
df_false_pos.shape
Out[Â ]:
(195, 66)
In [ ]:
df_true_neg.shape
Out[Â ]:
(542, 66)
In [ ]:
df_false_neg.shape
Out[Â ]:
(131, 66)
In [ ]:
df_true_values.shape
Out[Â ]:
(1148, 66)
In [ ]:
df_false_values.shape
Out[Â ]:
(326, 66)
Confusion Matrix¶
In [ ]:
from sklearn.metrics import confusion_matrix
file_path = 'fullpredicted.csv'
df_full = pd.read_csv(file_path)
cm = confusion_matrix(df_full['Actual'], df_full['Predicted'])
# Convert confusion matrix to DataFrame for better visualization
cm_df = pd.DataFrame(cm, index=['Actual Negative', 'Actual Positive'], columns=['Predicted Negative', 'Predicted Positive'])
print("Confusion Matrix:")
print(cm_df)
sns.heatmap(cm_df, annot=True,cmap="OrRd",fmt=',d')
--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) <ipython-input-12-4eac00ff6edd> in <cell line: 3>() 1 from sklearn.metrics import confusion_matrix 2 file_path = 'fullpredicted.csv' ----> 3 df_full = pd.read_csv(file_path) 4 5 cm = confusion_matrix(df_full['Actual'], df_full['Predicted']) /usr/local/lib/python3.10/dist-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs) 209 else: 210 kwargs[new_arg_name] = new_arg_value --> 211 return func(*args, **kwargs) 212 213 return cast(F, wrapper) /usr/local/lib/python3.10/dist-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs) 329 stacklevel=find_stack_level(), 330 ) --> 331 return func(*args, **kwargs) 332 333 # error: "Callable[[VarArg(Any), KwArg(Any)], Any]" has no /usr/local/lib/python3.10/dist-packages/pandas/io/parsers/readers.py in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options) 948 kwds.update(kwds_defaults) 949 --> 950 return _read(filepath_or_buffer, kwds) 951 952 /usr/local/lib/python3.10/dist-packages/pandas/io/parsers/readers.py in _read(filepath_or_buffer, kwds) 603 604 # Create the parser. --> 605 parser = TextFileReader(filepath_or_buffer, **kwds) 606 607 if chunksize or iterator: /usr/local/lib/python3.10/dist-packages/pandas/io/parsers/readers.py in __init__(self, f, engine, **kwds) 1440 1441 self.handles: IOHandles | None = None -> 1442 self._engine = self._make_engine(f, self.engine) 1443 1444 def close(self) -> None: /usr/local/lib/python3.10/dist-packages/pandas/io/parsers/readers.py in _make_engine(self, f, engine) 1733 if "b" not in mode: 1734 mode += "b" -> 1735 self.handles = get_handle( 1736 f, 1737 mode, /usr/local/lib/python3.10/dist-packages/pandas/io/common.py in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options) 854 if ioargs.encoding and "b" not in ioargs.mode: 855 # Encoding --> 856 handle = open( 857 handle, 858 ioargs.mode, FileNotFoundError: [Errno 2] No such file or directory: 'fullpredicted.csv'
Box Plots True Positive vs False Positive¶
In [ ]:
df_concat = pd.concat([df_true_pos, df_false_pos], keys=['True Positive', 'False Positive'], names=['Group'])
# Define the box_plot function
def box_plot(var, ax):
sns.boxplot(x='Predicted', y=var, hue='Group', data=df_concat.reset_index(), ax=ax)
ax.set_title(f'{var}')
ax.set_xlabel('')
ax.set_ylabel(var)
ax.legend().remove()
# List of continuous variables
continuous_var = ["SAPS II", "SOFA", "OASIS","Max Heart Rate", "Min Heart Rate", "Mean Heart Rate", "Max MAP", "Min MAP", "Mean MAP", "Max Systolic Pressure", "Min Systolic Pressure", "Mean Systolic Pressure", "Max Diastolic Pressure", "Min Diastolic Pressure", "Mean Diastolic Pressure", "Max Temperature", "Min Temperature", "Mean Temperature", "Max pH", "Min pH", "Mean pH", "Max Glucose", "Min Glucose", "Mean Glucose", "Max WBC", "Min WBC", "Mean WBC", "Max BUN", "Min BUN", "Mean BUN", "Max Creatinine", "Min Creatinine", "Mean Creatinine", "Max Hemoglobin", "Min Hemoglobin", "Mean Hemoglobin"]
# Calculate number of rows needed
num_rows = math.ceil(len(continuous_var) / 7)
# Create subplots
fig, axes = plt.subplots(num_rows, 7, figsize=(20, num_rows*3))
axes = axes.flatten()
# Iterate over continuous variables and plot in subplots
for i, var in enumerate(continuous_var):
if i < len(axes):
box_plot(var, axes[i])
else:
break
# Hide any remaining empty subplots
for ax in axes[len(continuous_var):]:
ax.axis('off')
# Add a general legend
fig.legend(labels=['True Positive', 'False Positive'], loc='lower right', fontsize=16, title='Group')
# Add a general title for the entire scatter plot
plt.suptitle('Comparision between True Positive and False Positive', fontsize=30, fontweight='bold')
plt.tight_layout()
plt.show()
Shapiro Test for normality¶
In [ ]:
continous_var = ["Age","SAPS II", "SOFA", "OASIS","Max Heart Rate", "Min Heart Rate", "Mean Heart Rate", "Max MAP", "Min MAP", "Mean MAP", "Max Systolic Pressure", "Min Systolic Pressure", "Mean Systolic Pressure", "Max Diastolic Pressure", "Min Diastolic Pressure",
"Mean Diastolic Pressure", "Max Temperature", "Min Temperature", "Mean Temperature", "Max pH", "Min pH", "Mean pH", "Max Glucose", "Min Glucose", "Mean Glucose", "Max WBC", "Min WBC", "Mean WBC", "Max BUN", "Min BUN", "Mean BUN", "Max Creatinine", "Min Creatinine", "Mean Creatinine", "Max Hemoglobin", "Min Hemoglobin", "Mean Hemoglobin"]
for col in continous_var:
statistic, p_value = shapiro(df_full[col])
print("Shapiro-Wilk Test:")
print("Test Statistic:", statistic)
print("P-value:","{:.5f}".format(p_value))
print()
Shapiro-Wilk Test: Test Statistic: 0.9437509775161743 P-value: 0.00000 Shapiro-Wilk Test: Test Statistic: 0.9738647937774658 P-value: 0.00000 Shapiro-Wilk Test: Test Statistic: 0.9337024688720703 P-value: 0.00000 Shapiro-Wilk Test: Test Statistic: 0.9931221008300781 P-value: 0.00000 Shapiro-Wilk Test: Test Statistic: 0.9711896777153015 P-value: 0.00000 Shapiro-Wilk Test: Test Statistic: 0.9920005202293396 P-value: 0.00000 Shapiro-Wilk Test: Test Statistic: 0.986447811126709 P-value: 0.00000 Shapiro-Wilk Test: Test Statistic: 0.7188106775283813 P-value: 0.00000 Shapiro-Wilk Test: Test Statistic: 0.9546191096305847 P-value: 0.00000 Shapiro-Wilk Test: Test Statistic: 0.970227062702179 P-value: 0.00000 Shapiro-Wilk Test: Test Statistic: 0.9645466804504395 P-value: 0.00000 Shapiro-Wilk Test: Test Statistic: 0.9810664653778076 P-value: 0.00000 Shapiro-Wilk Test: Test Statistic: 0.9779439568519592 P-value: 0.00000 Shapiro-Wilk Test: Test Statistic: 0.8706220984458923 P-value: 0.00000 Shapiro-Wilk Test: Test Statistic: 0.9882334470748901 P-value: 0.00000 Shapiro-Wilk Test: Test Statistic: 0.9862073659896851 P-value: 0.00000 Shapiro-Wilk Test: Test Statistic: 0.972098171710968 P-value: 0.00000 Shapiro-Wilk Test: Test Statistic: 0.901451051235199 P-value: 0.00000 Shapiro-Wilk Test: Test Statistic: 0.9573802351951599 P-value: 0.00000 Shapiro-Wilk Test: Test Statistic: 0.9606291651725769 P-value: 0.00000 Shapiro-Wilk Test: Test Statistic: 0.9509005546569824 P-value: 0.00000 Shapiro-Wilk Test: Test Statistic: 0.9432029724121094 P-value: 0.00000 Shapiro-Wilk Test: Test Statistic: 0.9099737405776978 P-value: 0.00000 Shapiro-Wilk Test: Test Statistic: 0.9211623072624207 P-value: 0.00000 Shapiro-Wilk Test: Test Statistic: 0.9190303087234497 P-value: 0.00000 Shapiro-Wilk Test: Test Statistic: 0.9268007874488831 P-value: 0.00000 Shapiro-Wilk Test: Test Statistic: 0.912987232208252 P-value: 0.00000 Shapiro-Wilk Test: Test Statistic: 0.9262189865112305 P-value: 0.00000 Shapiro-Wilk Test: Test Statistic: 0.8415319919586182 P-value: 0.00000 Shapiro-Wilk Test: Test Statistic: 0.8238466382026672 P-value: 0.00000 Shapiro-Wilk Test: Test Statistic: 0.8329563736915588 P-value: 0.00000 Shapiro-Wilk Test: Test Statistic: 0.6338029503822327 P-value: 0.00000 Shapiro-Wilk Test: Test Statistic: 0.6145962476730347 P-value: 0.00000 Shapiro-Wilk Test: Test Statistic: 0.6222485303878784 P-value: 0.00000 Shapiro-Wilk Test: Test Statistic: 0.9929233193397522 P-value: 0.00000 Shapiro-Wilk Test: Test Statistic: 0.9937649965286255 P-value: 0.00001 Shapiro-Wilk Test: Test Statistic: 0.980171263217926 P-value: 0.00000
Density Plots of True Positive and False Positive¶
In [ ]:
def density_plot(x, df_true_pos, df_false_pos, ax):
sns.kdeplot(df_true_pos[x], fill=True, ax=ax)
sns.kdeplot(df_false_pos[x], fill=True, ax=ax)
# Add labels and legend
ax.set_title(f'{x}')
ax.set_xlabel('Predicted Scores')
ax.set_ylabel('Density')
ax.legend()
# Calculate the number of rows needed based on the number of continuous variables
num_rows = (len(continous_var) + 7) // 7 # Ceiling division
# Create a figure and axis array with the specified number of rows and 6 columns
fig, axes = plt.subplots(num_rows, 7, figsize=(20, num_rows * 3))
# Flatten the axes array for easy iteration
axes = axes.flatten()
# Iterate over the continuous variables and corresponding axes
for x, ax in zip(continous_var, axes):
# If there are no more variables to plot, hide the axis
if x is None:
ax.axis('off')
continue
density_plot(x, df_true_pos, df_false_pos, ax)
# Hide any remaining empty subplots
for ax in axes[len(continous_var):]:
ax.axis('off')
# Add a general legend for the entire scatter plot
plt.figlegend(labels=['True Values', 'False Values'], loc='lower right', fontsize=16)
# Add a general title for the entire scatter plot
plt.suptitle('Density Plots for True Positive and False Positive values of Continuous Variables', fontsize=30, fontweight='bold')
plt.tight_layout()
plt.show()
WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
T-test for True Positive vs False Positive¶
In [ ]:
# Extracting only continuous variables
df_continuous_var = ["Age","SAPS II", "SOFA", "OASIS","Max Heart Rate", "Min Heart Rate", "Mean Heart Rate", "Max MAP", "Min MAP", "Mean MAP", "Max Systolic Pressure", "Min Systolic Pressure", "Mean Systolic Pressure", "Max Diastolic Pressure", "Min Diastolic Pressure","Mean Diastolic Pressure", "Max Temperature", "Min Temperature", "Mean Temperature", "Max pH", "Min pH", "Mean pH", "Max Glucose", "Min Glucose", "Mean Glucose", "Max WBC", "Min WBC", "Mean WBC", "Max BUN", "Min BUN", "Mean BUN", "Max Creatinine", "Min Creatinine", "Mean Creatinine", "Max Hemoglobin", "Min Hemoglobin", "Mean Hemoglobin"]
variables = []
p_values = []
for column in df_continuous_var:
true_pos = df_true_pos[column]
false_pos = df_false_pos[column]
t_statistic, p_value = ttest_ind(true_pos, false_pos)
print(f"P-value of {column}: {p_value:.5f}")
variables.append(column)
p_values.append(p_value)
# Sort variables and p_values based on p_values
sorted_variables, sorted_p_values = zip(*sorted(zip(variables, p_values), key=lambda x: x[1], reverse=True))
# Plotting the p-values in a graph
plt.figure(figsize=(10, 6))
plt.barh(np.arange(len(sorted_variables)), sorted_p_values, color='skyblue')
plt.yticks(np.arange(len(sorted_variables)), sorted_variables)
plt.xlabel('P-value')
plt.title('P-values of T-tests for Continuous Variables (True Positives vs False Positives)')
plt.axvline(0.05, color='r', linestyle='--', label='Significance level (0.05)')
plt.legend()
plt.show()
P-value of Age: 0.05422 P-value of SAPS II: 0.00000 P-value of SOFA: 0.00000 P-value of OASIS: 0.00000 P-value of Max Heart Rate: 0.32121 P-value of Min Heart Rate: 0.08374 P-value of Mean Heart Rate: 0.37215 P-value of Max MAP: 0.08868 P-value of Min MAP: 0.00004 P-value of Mean MAP: 0.00055 P-value of Max Systolic Pressure: 0.06189 P-value of Min Systolic Pressure: 0.00000 P-value of Mean Systolic Pressure: 0.00007 P-value of Max Diastolic Pressure: 0.00336 P-value of Min Diastolic Pressure: 0.00442 P-value of Mean Diastolic Pressure: 0.00366 P-value of Max Temperature: 0.00900 P-value of Min Temperature: 0.00003 P-value of Mean Temperature: 0.00003 P-value of Max pH: 0.00043 P-value of Min pH: 0.00029 P-value of Mean pH: 0.00004 P-value of Max Glucose: 0.08275 P-value of Min Glucose: 0.27195 P-value of Mean Glucose: 0.51764 P-value of Max WBC: 0.01355 P-value of Min WBC: 0.02549 P-value of Mean WBC: 0.02050 P-value of Max BUN: 0.05387 P-value of Min BUN: 0.00996 P-value of Mean BUN: 0.02537 P-value of Max Creatinine: 0.10473 P-value of Min Creatinine: 0.03166 P-value of Mean Creatinine: 0.06553 P-value of Max Hemoglobin: 0.34912 P-value of Min Hemoglobin: 0.22546 P-value of Mean Hemoglobin: 0.35980
Mann Whitney U Test¶
In [ ]:
variables = []
p_values = []
df_continuous_var = ["Age","SAPS II", "SOFA", "OASIS","Max Heart Rate", "Min Heart Rate", "Mean Heart Rate", "Max MAP", "Min MAP", "Mean MAP", "Max Systolic Pressure", "Min Systolic Pressure", "Mean Systolic Pressure", "Max Diastolic Pressure", "Min Diastolic Pressure","Mean Diastolic Pressure", "Max Temperature", "Min Temperature", "Mean Temperature", "Max pH", "Min pH", "Mean pH", "Max Glucose", "Min Glucose", "Mean Glucose", "Max WBC", "Min WBC", "Mean WBC", "Max BUN", "Min BUN", "Mean BUN", "Max Creatinine", "Min Creatinine", "Mean Creatinine", "Max Hemoglobin", "Min Hemoglobin", "Mean Hemoglobin"]
for col in df_continuous_var:
true_pos = df_true_pos[col]
false_pos = df_false_pos[col]
stat, p_value = mannwhitneyu(true_pos, false_pos)
print(f"P-value of {col}:", "{:.5f}".format(p_value))
variables.append(col)
p_values.append(p_value)
sorted_variables, sorted_p_values = zip(*sorted(zip(variables, p_values), key=lambda x: x[1], reverse=True))
# Plot the p-values
plt.figure(figsize=(10, 6))
plt.barh(np.arange(len(sorted_variables)), sorted_p_values, color='skyblue')
plt.yticks(np.arange(len(sorted_variables)), sorted_variables)
plt.xlabel('P-value')
plt.title('P-values of Mann Whitney for Continuous Variables (True Positives vs False Positives)')
plt.axvline(0.05, color='r', linestyle='--', label='Significance level (0.05)')
plt.legend()
plt.show()
P-value of Age: 0.07877 P-value of SAPS II: 0.00000 P-value of SOFA: 0.00000 P-value of OASIS: 0.00000 P-value of Max Heart Rate: 0.23041 P-value of Min Heart Rate: 0.10210 P-value of Mean Heart Rate: 0.37054 P-value of Max MAP: 0.02056 P-value of Min MAP: 0.00010 P-value of Mean MAP: 0.00029 P-value of Max Systolic Pressure: 0.01719 P-value of Min Systolic Pressure: 0.00000 P-value of Mean Systolic Pressure: 0.00001 P-value of Max Diastolic Pressure: 0.00107 P-value of Min Diastolic Pressure: 0.00196 P-value of Mean Diastolic Pressure: 0.00271 P-value of Max Temperature: 0.02914 P-value of Min Temperature: 0.00003 P-value of Mean Temperature: 0.00014 P-value of Max pH: 0.00409 P-value of Min pH: 0.00161 P-value of Mean pH: 0.00034 P-value of Max Glucose: 0.24350 P-value of Min Glucose: 0.27919 P-value of Mean Glucose: 0.60837 P-value of Max WBC: 0.01624 P-value of Min WBC: 0.06511 P-value of Mean WBC: 0.03238 P-value of Max BUN: 0.06239 P-value of Min BUN: 0.00699 P-value of Mean BUN: 0.02722 P-value of Max Creatinine: 0.01927 P-value of Min Creatinine: 0.00802 P-value of Mean Creatinine: 0.01348 P-value of Max Hemoglobin: 0.19733 P-value of Min Hemoglobin: 0.20942 P-value of Mean Hemoglobin: 0.23240
Box Plots True Negative vs False Negative¶
In [ ]:
df_concat = pd.concat([df_true_neg, df_false_neg], keys=['True Negative', 'False Negative'], names=['Group'])
# Define the box_plot function
def box_plot(var, ax):
sns.boxplot(x='Predicted', y=var, hue='Group', data=df_concat.reset_index(), ax=ax)
ax.set_title(f'{var}')
ax.set_xlabel('')
ax.set_ylabel(var)
ax.legend().remove()
# List of continuous variables
continuous_var = ["Age","SAPS II", "SOFA", "OASIS","Max Heart Rate", "Min Heart Rate", "Mean Heart Rate", "Max MAP", "Min MAP", "Mean MAP", "Max Systolic Pressure", "Min Systolic Pressure", "Mean Systolic Pressure", "Max Diastolic Pressure", "Min Diastolic Pressure", "Mean Diastolic Pressure", "Max Temperature", "Min Temperature", "Mean Temperature", "Max pH", "Min pH", "Mean pH", "Max Glucose", "Min Glucose", "Mean Glucose", "Max WBC", "Min WBC", "Mean WBC", "Max BUN", "Min BUN", "Mean BUN", "Max Creatinine", "Min Creatinine", "Mean Creatinine", "Max Hemoglobin", "Min Hemoglobin", "Mean Hemoglobin"]
# Calculate number of rows needed
num_rows = math.ceil(len(continuous_var) / 7)
# Create subplots
fig, axes = plt.subplots(num_rows, 7, figsize=(20, num_rows*3))
axes = axes.flatten()
# Iterate over continuous variables and plot in subplots
for i, var in enumerate(continuous_var):
if i < len(axes):
box_plot(var, axes[i])
else:
break
# Hide any remaining empty subplots
for ax in axes[len(continuous_var):]:
ax.axis('off')
# Add a general legend
fig.legend(labels=['True Negative', 'False Negative'], loc='lower right', fontsize=16, title='Group')
# Add a general title for the entire scatter plot
plt.suptitle('Comparision between True Negative and False Negative', fontsize=30, fontweight='bold')
plt.tight_layout()
plt.show()
Density Plots of True Negative and False Negative¶
In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt
def density_plot(x, df_true_neg, df_false_neg, ax):
sns.kdeplot(df_true_neg[x], fill=True, ax=ax)
sns.kdeplot(df_false_neg[x], fill=True, ax=ax)
# Add labels and legend
ax.set_title(f'{x}')
ax.set_xlabel('Predicted Scores')
ax.set_ylabel('Density')
ax.legend()
# Calculate the number of rows needed based on the number of continuous variables
num_rows = (len(continous_var) + 7) // 7 # Ceiling division
# Create a figure and axis array with the specified number of rows and 8 columns
fig, axes = plt.subplots(num_rows, 7, figsize=(20, num_rows * 3))
# Flatten the axes array for easy iteration
axes = axes.flatten()
# Iterate over the continuous variables and corresponding axes
for x, ax in zip(continous_var, axes):
# If there are no more variables to plot, hide the axis
if x is None:
ax.axis('off')
continue
density_plot(x, df_true_neg, df_false_neg, ax)
# Hide any remaining empty subplots
for ax in axes[len(continous_var):]:
ax.axis('off')
# Add a general legend for the entire scatter plot
plt.figlegend(labels=['True Values', 'False Values'], loc='lower right', fontsize=16)
# Add a general title for the entire scatter plot
plt.suptitle('Density Plots for True Negative and False Negative values of Continuous Variables', fontsize=30, fontweight='bold')
plt.tight_layout()
plt.show()
WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
T-test for True Negative vs False Negative¶
In [ ]:
# Extracting only continuous variables
df_continuous_var = ["Age","SAPS II", "SOFA", "OASIS","Max Heart Rate", "Min Heart Rate", "Mean Heart Rate", "Max MAP", "Min MAP", "Mean MAP", "Max Systolic Pressure", "Min Systolic Pressure", "Mean Systolic Pressure", "Max Diastolic Pressure", "Min Diastolic Pressure","Mean Diastolic Pressure", "Max Temperature", "Min Temperature", "Mean Temperature", "Max pH", "Min pH", "Mean pH", "Max Glucose", "Min Glucose", "Mean Glucose", "Max WBC", "Min WBC", "Mean WBC", "Max BUN", "Min BUN", "Mean BUN", "Max Creatinine", "Min Creatinine", "Mean Creatinine", "Max Hemoglobin", "Min Hemoglobin", "Mean Hemoglobin"]
variables = []
p_values = []
for column in df_continuous_var:
true_neg = df_true_neg[column]
false_neg = df_false_neg[column]
t_statistic, p_value = ttest_ind(true_neg, false_neg)
print(f"P-value of {column}: {p_value:.5f}")
variables.append(column)
p_values.append(p_value)
# Sort variables and p_values based on p_values
sorted_variables, sorted_p_values = zip(*sorted(zip(variables, p_values), key=lambda x: x[1], reverse=True))
# Plotting the p-values in a graph
plt.figure(figsize=(10, 6))
plt.barh(np.arange(len(sorted_variables)), sorted_p_values, color='skyblue')
plt.yticks(np.arange(len(sorted_variables)), sorted_variables)
plt.xlabel('P-value')
plt.title('P-values of T-tests for Continuous Variables (True Negatives vs False Negatives)')
plt.axvline(0.05, color='r', linestyle='--', label='Significance level (0.05)')
plt.legend()
plt.show()
P-value of Age: 0.03328 P-value of SAPS II: 0.00019 P-value of SOFA: 0.32780 P-value of OASIS: 0.04823 P-value of Max Heart Rate: 0.56985 P-value of Min Heart Rate: 0.14249 P-value of Mean Heart Rate: 0.46821 P-value of Max MAP: 0.02506 P-value of Min MAP: 0.87321 P-value of Mean MAP: 0.00000 P-value of Max Systolic Pressure: 0.00000 P-value of Min Systolic Pressure: 0.06807 P-value of Mean Systolic Pressure: 0.00000 P-value of Max Diastolic Pressure: 0.04111 P-value of Min Diastolic Pressure: 0.65160 P-value of Mean Diastolic Pressure: 0.01598 P-value of Max Temperature: 0.13346 P-value of Min Temperature: 0.34189 P-value of Mean Temperature: 0.82163 P-value of Max pH: 0.49452 P-value of Min pH: 0.48933 P-value of Mean pH: 0.41530 P-value of Max Glucose: 0.90527 P-value of Min Glucose: 0.00262 P-value of Mean Glucose: 0.21587 P-value of Max WBC: 0.32148 P-value of Min WBC: 0.07676 P-value of Mean WBC: 0.15066 P-value of Max BUN: 0.06567 P-value of Min BUN: 0.08726 P-value of Mean BUN: 0.07487 P-value of Max Creatinine: 0.02917 P-value of Min Creatinine: 0.04930 P-value of Mean Creatinine: 0.03102 P-value of Max Hemoglobin: 0.00875 P-value of Min Hemoglobin: 0.00200 P-value of Mean Hemoglobin: 0.19756
Boxplots of True values with False values¶
In [ ]:
df_concat = pd.concat([df_true_values, df_false_values], keys=['True', 'False'], names=['Group'])
# Define the box_plot function
def box_plot(var, ax):
sns.boxplot(x='Predicted', y=var, hue='Group', data=df_concat.reset_index(), ax=ax)
ax.set_title(f'{var}')
ax.set_xlabel('')
ax.set_ylabel(var)
ax.legend().remove()
# List of continuous variables
continuous_var = ["SAPS II", "SOFA", "OASIS","Max Heart Rate", "Min Heart Rate", "Mean Heart Rate", "Max MAP", "Min MAP", "Mean MAP", "Max Systolic Pressure", "Min Systolic Pressure", "Mean Systolic Pressure", "Max Diastolic Pressure", "Min Diastolic Pressure", "Mean Diastolic Pressure", "Max Temperature", "Min Temperature", "Mean Temperature", "Max pH", "Min pH", "Mean pH", "Max Glucose", "Min Glucose", "Mean Glucose", "Max WBC", "Min WBC", "Mean WBC", "Max BUN", "Min BUN", "Mean BUN", "Max Creatinine", "Min Creatinine", "Mean Creatinine", "Max Hemoglobin", "Min Hemoglobin", "Mean Hemoglobin"]
# Calculate number of rows needed
num_rows = math.ceil(len(continuous_var) / 7)
# Create subplots
fig, axes = plt.subplots(num_rows, 7, figsize=(20, num_rows*3))
axes = axes.flatten()
# Iterate over continuous variables and plot in subplots
for i, var in enumerate(continuous_var):
if i < len(axes):
box_plot(var, axes[i])
else:
break
# Hide any remaining empty subplots
for ax in axes[len(continuous_var):]:
ax.axis('off')
# Add a general legend
fig.legend(labels=['True', 'False'], loc='lower right', fontsize=16, title='Group')
# Add a general title for the entire scatter plot
plt.suptitle('Comparision between True Values and False Values', fontsize=30, fontweight='bold')
plt.tight_layout()
plt.show()
Density Plots of True values and False Values¶
In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt
def density_plot(x, df_true_values, df_false_values, ax):
sns.kdeplot(df_true_values[x], fill=True, ax=ax)
sns.kdeplot(df_false_values[x], fill=True, ax=ax)
# Add labels and legend
ax.set_title(f'{x}')
ax.set_xlabel('Predicted Scores')
ax.set_ylabel('Density')
ax.legend()
# Calculate the number of rows needed based on the number of continuous variables
num_rows = (len(continous_var) + 6) // 7 # Ceiling division
# Create a figure and axis array with the specified number of rows and 8 columns
fig, axes = plt.subplots(num_rows, 7, figsize=(20, num_rows * 3))
# Flatten the axes array for easy iteration
axes = axes.flatten()
# Iterate over the continuous variables and corresponding axes
for x, ax in zip(continous_var, axes):
# If there are no more variables to plot, hide the axis
if x is None:
ax.axis('off')
continue
density_plot(x, df_true_values, df_false_values, ax)
# Hide any remaining empty subplots
for ax in axes[len(continous_var):]:
ax.axis('off')
# Add a general legend for the entire scatter plot
plt.figlegend(labels=['True Values', 'False Values'], loc='lower right', fontsize=16)
# Add a general title for the entire scatter plot
plt.suptitle('Density Plots for True and False values of Continuous Variables', fontsize=30, fontweight='bold')
plt.tight_layout()
plt.show()
WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
T-test for True values vs False values¶
In [ ]:
# Extracting only continuous variables
df_continuous_var = ["Age","SAPS II", "SOFA", "OASIS","Max Heart Rate", "Min Heart Rate", "Mean Heart Rate", "Max MAP", "Min MAP", "Mean MAP", "Max Systolic Pressure", "Min Systolic Pressure", "Mean Systolic Pressure", "Max Diastolic Pressure", "Min Diastolic Pressure","Mean Diastolic Pressure", "Max Temperature", "Min Temperature", "Mean Temperature", "Max pH", "Min pH", "Mean pH", "Max Glucose", "Min Glucose", "Mean Glucose", "Max WBC", "Min WBC", "Mean WBC", "Max BUN", "Min BUN", "Mean BUN", "Max Creatinine", "Min Creatinine", "Mean Creatinine", "Max Hemoglobin", "Min Hemoglobin", "Mean Hemoglobin"]
variables = []
p_values = []
for column in df_continuous_var:
true_pos = df_true_values[column]
false_pos = df_false_values[column]
t_statistic, p_value = ttest_ind(true_pos, false_pos)
print(f"P-value of {column}: {p_value:.5f}")
variables.append(column)
p_values.append(p_value)
# Sort variables and p_values based on p_values
sorted_variables, sorted_p_values = zip(*sorted(zip(variables, p_values), key=lambda x: x[1], reverse=True))
# Plotting the p-values in a graph
plt.figure(figsize=(10, 6))
plt.barh(np.arange(len(sorted_variables)), sorted_p_values, color='skyblue')
plt.yticks(np.arange(len(sorted_variables)), sorted_variables)
plt.xlabel('P-value')
plt.title('P-values of T-tests for Continuous Variables (True Values vs False Values)')
plt.axvline(0.05, color='r', linestyle='--', label='Significance level (0.05)')
plt.legend()
plt.show()
P-value of Age: 0.61497 P-value of SAPS II: 0.14265 P-value of SOFA: 0.00003 P-value of OASIS: 0.07948 P-value of Max Heart Rate: 0.93380 P-value of Min Heart Rate: 0.34607 P-value of Mean Heart Rate: 0.52893 P-value of Max MAP: 0.00511 P-value of Min MAP: 0.00296 P-value of Mean MAP: 0.00000 P-value of Max Systolic Pressure: 0.00005 P-value of Min Systolic Pressure: 0.00001 P-value of Mean Systolic Pressure: 0.00000 P-value of Max Diastolic Pressure: 0.00016 P-value of Min Diastolic Pressure: 0.04571 P-value of Mean Diastolic Pressure: 0.00031 P-value of Max Temperature: 0.00379 P-value of Min Temperature: 0.00248 P-value of Mean Temperature: 0.00026 P-value of Max pH: 0.02291 P-value of Min pH: 0.01006 P-value of Mean pH: 0.00031 P-value of Max Glucose: 0.33188 P-value of Min Glucose: 0.00437 P-value of Mean Glucose: 0.55742 P-value of Max WBC: 0.14600 P-value of Min WBC: 0.34880 P-value of Mean WBC: 0.23963 P-value of Max BUN: 0.97997 P-value of Min BUN: 0.51364 P-value of Mean BUN: 0.77214 P-value of Max Creatinine: 0.83034 P-value of Min Creatinine: 0.74198 P-value of Mean Creatinine: 0.98873 P-value of Max Hemoglobin: 0.25010 P-value of Min Hemoglobin: 0.00259 P-value of Mean Hemoglobin: 0.13727
Categorical - Mode¶
In [ ]:
# Get the mode of the Gender column in the True Positive group
mode_gender = df_concat.loc['True Positive', 'Complicated Diabetes'].mode()[0]
# Print the mode
print(mode_gender)
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) /usr/local/lib/python3.10/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance) 3801 try: -> 3802 return self._engine.get_loc(casted_key) 3803 except KeyError as err: /usr/local/lib/python3.10/dist-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc() /usr/local/lib/python3.10/dist-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() KeyError: 'True Positive' The above exception was the direct cause of the following exception: KeyError Traceback (most recent call last) <ipython-input-119-058b76b7089a> in <cell line: 2>() 1 # Get the mode of the Gender column in the True Positive group ----> 2 mode_gender = df_concat.loc['True Positive', 'Complicated Diabetes'].mode()[0] 3 4 # Print the mode 5 print(mode_gender) /usr/local/lib/python3.10/dist-packages/pandas/core/indexing.py in __getitem__(self, key) 1065 if self._is_scalar_access(key): 1066 return self.obj._get_value(*key, takeable=self._takeable) -> 1067 return self._getitem_tuple(key) 1068 else: 1069 # we by definition only have the 0th axis /usr/local/lib/python3.10/dist-packages/pandas/core/indexing.py in _getitem_tuple(self, tup) 1245 with suppress(IndexingError): 1246 tup = self._expand_ellipsis(tup) -> 1247 return self._getitem_lowerdim(tup) 1248 1249 # no multi-index, so validate all of the indexers /usr/local/lib/python3.10/dist-packages/pandas/core/indexing.py in _getitem_lowerdim(self, tup) 965 # We don't need to check for tuples here because those are 966 # caught by the _is_nested_tuple_indexer check above. --> 967 section = self._getitem_axis(key, axis=i) 968 969 # We should never have a scalar section here, because /usr/local/lib/python3.10/dist-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis) 1310 # fall thru to straight lookup 1311 self._validate_key(key, axis) -> 1312 return self._get_label(key, axis=axis) 1313 1314 def _get_slice_axis(self, slice_obj: slice, axis: int): /usr/local/lib/python3.10/dist-packages/pandas/core/indexing.py in _get_label(self, label, axis) 1258 def _get_label(self, label, axis: int): 1259 # GH#5567 this will fail if the label is not present in the axis. -> 1260 return self.obj.xs(label, axis=axis) 1261 1262 def _handle_lowerdim_multi_index_axis0(self, tup: tuple): /usr/local/lib/python3.10/dist-packages/pandas/core/generic.py in xs(self, key, axis, level, drop_level) 4047 4048 if isinstance(index, MultiIndex): -> 4049 loc, new_index = index._get_loc_level(key, level=0) 4050 if not drop_level: 4051 if lib.is_integer(loc): /usr/local/lib/python3.10/dist-packages/pandas/core/indexes/multi.py in _get_loc_level(self, key, level) 3158 return indexer, maybe_mi_droplevels(indexer, ilevels) 3159 else: -> 3160 indexer = self._get_level_indexer(key, level=level) 3161 if ( 3162 isinstance(key, str) /usr/local/lib/python3.10/dist-packages/pandas/core/indexes/multi.py in _get_level_indexer(self, key, level, indexer) 3261 else: 3262 -> 3263 idx = self._get_loc_single_level_index(level_index, key) 3264 3265 if level > 0 or self._lexsort_depth == 0: /usr/local/lib/python3.10/dist-packages/pandas/core/indexes/multi.py in _get_loc_single_level_index(self, level_index, key) 2847 return -1 2848 else: -> 2849 return level_index.get_loc(key) 2850 2851 def get_loc(self, key, method=None): /usr/local/lib/python3.10/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance) 3802 return self._engine.get_loc(casted_key) 3803 except KeyError as err: -> 3804 raise KeyError(key) from err 3805 except TypeError: 3806 # If we have a listlike key, _check_indexing_error will raise KeyError: 'True Positive'
In [ ]:
cat_variables = ['Gender','Uncomplicated Hypertension', 'Complicated Hypertension', 'Uncomplicated Diabetes',
'Complicated Diabetes', 'Malignancy', 'Hematologic Disease',
'Metastasis', 'Peripheral Vascular Disease', 'Hypothyroidism',
'Chronic Heart Failure', 'Stroke', 'Liver Disease','Sepsis', 'Any Organ Failure',
'Severe Respiratory Failure',
'Severe Coagulation Failure', 'Severe Liver Failure',
'Severe Cardiovascular Failure','Severe Central Nervous System Failure', 'Severe Renal Failure',
'Respiratory Dysfunction', 'Cardiovascular Dysfunction',
'Renal Dysfunction', 'Hematologic Dysfunction', 'Metabolic Dysfunction', 'Neurologic Dysfunction']
df_concat = pd.concat([df_t[df_t['Predicted'] == 1], df_fp], keys=['True Positive', 'False Positive'], names=['Group'])
for category in cat_variables:
if df_concat.loc['True Positive', category].mode()[0] != df_concat.loc['False Positive', category].mode()[0]:
print(f'There is the different in mode of {category}')
print(f"Mode of {category} of True Positive: {df_concat.loc['True Positive', category].mode()[0]} - False Positive: {df_concat.loc['False Positive', category].mode()[0]}")
Mode of Gender of True Positive: 1 - False Positive: 1 Mode of Uncomplicated Hypertension of True Positive: 0 - False Positive: 0 Mode of Complicated Hypertension of True Positive: 0 - False Positive: 0 Mode of Uncomplicated Diabetes of True Positive: 0 - False Positive: 0 Mode of Complicated Diabetes of True Positive: 0 - False Positive: 0 Mode of Malignancy of True Positive: 0 - False Positive: 0 Mode of Hematologic Disease of True Positive: 0 - False Positive: 0 Mode of Metastasis of True Positive: 0 - False Positive: 0 Mode of Peripheral Vascular Disease of True Positive: 0 - False Positive: 0 Mode of Hypothyroidism of True Positive: 0 - False Positive: 0 Mode of Chronic Heart Failure of True Positive: 0 - False Positive: 0 Mode of Stroke of True Positive: 0 - False Positive: 0 Mode of Liver Disease of True Positive: 0 - False Positive: 0 Mode of Sepsis of True Positive: 0 - False Positive: 0 Mode of Any Organ Failure of True Positive: 1 - False Positive: 1 Mode of Severe Respiratory Failure of True Positive: 0 - False Positive: 0 Mode of Severe Coagulation Failure of True Positive: 0 - False Positive: 0 Mode of Severe Liver Failure of True Positive: 0 - False Positive: 0 Mode of Severe Cardiovascular Failure of True Positive: 0 - False Positive: 0 Mode of Severe Central Nervous System Failure of True Positive: 0 - False Positive: 0 Mode of Severe Renal Failure of True Positive: 0 - False Positive: 0 Mode of Respiratory Dysfunction of True Positive: 0 - False Positive: 0 Mode of Cardiovascular Dysfunction of True Positive: 0 - False Positive: 0 Mode of Renal Dysfunction of True Positive: 0 - False Positive: 0 Mode of Hematologic Dysfunction of True Positive: 0 - False Positive: 0 Mode of Metabolic Dysfunction of True Positive: 0 - False Positive: 0 Mode of Neurologic Dysfunction of True Positive: 0 - False Positive: 0
In [ ]:
cat_variables = ['Gender','Uncomplicated Hypertension', 'Complicated Hypertension', 'Uncomplicated Diabetes',
'Complicated Diabetes', 'Malignancy', 'Hematologic Disease',
'Metastasis', 'Peripheral Vascular Disease', 'Hypothyroidism',
'Chronic Heart Failure', 'Stroke', 'Liver Disease','Sepsis', 'Any Organ Failure',
'Severe Respiratory Failure',
'Severe Coagulation Failure', 'Severe Liver Failure',
'Severe Cardiovascular Failure','Severe Central Nervous System Failure', 'Severe Renal Failure',
'Respiratory Dysfunction', 'Cardiovascular Dysfunction',
'Renal Dysfunction', 'Hematologic Dysfunction', 'Metabolic Dysfunction', 'Neurologic Dysfunction']
df_concat = pd.concat([df_true_pos, df_false_pos], keys=['True Positive', 'False Positive'], names=['Group'])
for category in cat_variables:
fig, ax = plt.subplots(1, 2, sharey=True)
#set the figure size
fig.set_figwidth(15)
sns.countplot(data = df_concat.loc['True Positive'], x = category, hue = category, ax=ax[0])
ax[0].set_xlabel('True Positive')
sns.countplot(data = df_concat.loc['False Positive'], x = category, hue = category, ax=ax[1])
ax[1].set_xlabel('False Positive')
# Calculate percentages
true_pos_count_0 = df_concat[df_concat[category] == 0].loc['True Positive'].shape[0]
true_pos_count_1 = df_concat[df_concat[category] == 1].loc['True Positive'].shape[0]
true_pos_total = df_concat.loc['True Positive'].shape[0]
true_pos_percentage = (abs(true_pos_count_0 - true_pos_count_1) / true_pos_total) * 100
flase_pos_count_0 = df_concat[df_concat[category] == 0].loc['False Positive'].shape[0]
flase_pos_count_1 = df_concat[df_concat[category] == 1].loc['False Positive'].shape[0]
flase_pos_total = df_concat.loc['False Positive'].shape[0]
flase_pos_percentage = (abs(flase_pos_count_0 - flase_pos_count_1) / flase_pos_total) * 100
# Annotate the plot with the percentage difference
ax[0].text(0.5, 0.5, f'Difference: {true_pos_percentage:.2f}%', horizontalalignment='center', verticalalignment='center', transform=ax[0].transAxes, fontsize=12, color='red')
ax[1].text(0.5, 0.5, f'Difference: {flase_pos_percentage:.2f}%', horizontalalignment='center', verticalalignment='center', transform=ax[1].transAxes, fontsize=12, color='red')
plt.suptitle(category)
plt.tight_layout()
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) <ipython-input-91-d31c3f7b8044> in <cell line: 12>() 28 29 flase_pos_count_0 = df_concat[df_concat[category] == 0].loc['False Positive'].shape[0] ---> 30 flase_pos_count_1 = df_concat[df_concat[category] == 1].loc['False Positive'].shape[0] 31 flase_pos_total = df_concat.loc['False Positive'].shape[0] 32 /usr/local/lib/python3.10/dist-packages/pandas/core/indexing.py in __getitem__(self, key) 1071 1072 maybe_callable = com.apply_if_callable(key, self.obj) -> 1073 return self._getitem_axis(maybe_callable, axis=axis) 1074 1075 def _is_scalar_access(self, key: tuple): /usr/local/lib/python3.10/dist-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis) 1310 # fall thru to straight lookup 1311 self._validate_key(key, axis) -> 1312 return self._get_label(key, axis=axis) 1313 1314 def _get_slice_axis(self, slice_obj: slice, axis: int): /usr/local/lib/python3.10/dist-packages/pandas/core/indexing.py in _get_label(self, label, axis) 1258 def _get_label(self, label, axis: int): 1259 # GH#5567 this will fail if the label is not present in the axis. -> 1260 return self.obj.xs(label, axis=axis) 1261 1262 def _handle_lowerdim_multi_index_axis0(self, tup: tuple): /usr/local/lib/python3.10/dist-packages/pandas/core/generic.py in xs(self, key, axis, level, drop_level) 4047 4048 if isinstance(index, MultiIndex): -> 4049 loc, new_index = index._get_loc_level(key, level=0) 4050 if not drop_level: 4051 if lib.is_integer(loc): /usr/local/lib/python3.10/dist-packages/pandas/core/indexes/multi.py in _get_loc_level(self, key, level) 3158 return indexer, maybe_mi_droplevels(indexer, ilevels) 3159 else: -> 3160 indexer = self._get_level_indexer(key, level=level) 3161 if ( 3162 isinstance(key, str) /usr/local/lib/python3.10/dist-packages/pandas/core/indexes/multi.py in _get_level_indexer(self, key, level, indexer) 3288 if start == end: 3289 # The label is present in self.levels[level] but unused: -> 3290 raise KeyError(key) 3291 return slice(start, end) 3292 KeyError: 'False Positive'
In [ ]:
cat_variables = ['Gender','Uncomplicated Hypertension', 'Complicated Hypertension', 'Uncomplicated Diabetes',
'Complicated Diabetes', 'Malignancy', 'Hematologic Disease',
'Metastasis', 'Peripheral Vascular Disease', 'Hypothyroidism',
'Chronic Heart Failure', 'Stroke', 'Liver Disease','Sepsis', 'Any Organ Failure',
'Severe Respiratory Failure',
'Severe Coagulation Failure', 'Severe Liver Failure',
'Severe Cardiovascular Failure','Severe Central Nervous System Failure', 'Severe Renal Failure',
'Respiratory Dysfunction', 'Cardiovascular Dysfunction',
'Renal Dysfunction', 'Hematologic Dysfunction', 'Metabolic Dysfunction', 'Neurologic Dysfunction']
df_concat = pd.concat([df_true_neg, df_false_neg], keys=['True Negative', 'False Negative'], names=['Group'])
for category in cat_variables:
fig, ax = plt.subplots(1, 2, sharey=True)
#set the figure size
fig.set_figwidth(15)
sns.countplot(data = df_concat.loc['True Negative'], x = category, hue = category, ax=ax[0])
ax[0].set_xlabel('True Negative')
sns.countplot(data = df_concat.loc['False Negative'], x = category, hue = category, ax=ax[1])
ax[1].set_xlabel('False Negative')
# Calculate percentages
true_pos_count_0 = df_concat[df_concat[category] == 0].loc['True Negative'].shape[0]
true_pos_count_1 = df_concat[df_concat[category] == 1].loc['True Negative'].shape[0]
true_pos_total = df_concat.loc['True Negative'].shape[0]
true_pos_percentage = (abs(true_pos_count_0 - true_pos_count_1) / true_pos_total) * 100
flase_pos_count_0 = df_concat[df_concat[category] == 0].loc['False Negative'].shape[0]
flase_pos_count_1 = df_concat[df_concat[category] == 1].loc['False Negative'].shape[0]
flase_pos_total = df_concat.loc['False Negative'].shape[0]
flase_pos_percentage = (abs(flase_pos_count_0 - flase_pos_count_1) / flase_pos_total) * 100
# Annotate the plot with the percentage difference
ax[0].text(0.5, 0.5, f'Difference: {true_pos_percentage:.2f}%', horizontalalignment='center', verticalalignment='center', transform=ax[0].transAxes, fontsize=12, color='red')
ax[1].text(0.5, 0.5, f'Difference: {flase_pos_percentage:.2f}%', horizontalalignment='center', verticalalignment='center', transform=ax[1].transAxes, fontsize=12, color='red')
plt.suptitle(category)
plt.tight_layout()
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) <ipython-input-89-b44c55bc4b64> in <cell line: 12>() 22 # Calculate percentages 23 true_pos_count_0 = df_concat[df_concat[category] == 0].loc['True Negative'].shape[0] ---> 24 true_pos_count_1 = df_concat[df_concat[category] == 1].loc['True Negative'].shape[0] 25 true_pos_total = df_concat.loc['True Negative'].shape[0] 26 /usr/local/lib/python3.10/dist-packages/pandas/core/indexing.py in __getitem__(self, key) 1071 1072 maybe_callable = com.apply_if_callable(key, self.obj) -> 1073 return self._getitem_axis(maybe_callable, axis=axis) 1074 1075 def _is_scalar_access(self, key: tuple): /usr/local/lib/python3.10/dist-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis) 1310 # fall thru to straight lookup 1311 self._validate_key(key, axis) -> 1312 return self._get_label(key, axis=axis) 1313 1314 def _get_slice_axis(self, slice_obj: slice, axis: int): /usr/local/lib/python3.10/dist-packages/pandas/core/indexing.py in _get_label(self, label, axis) 1258 def _get_label(self, label, axis: int): 1259 # GH#5567 this will fail if the label is not present in the axis. -> 1260 return self.obj.xs(label, axis=axis) 1261 1262 def _handle_lowerdim_multi_index_axis0(self, tup: tuple): /usr/local/lib/python3.10/dist-packages/pandas/core/generic.py in xs(self, key, axis, level, drop_level) 4047 4048 if isinstance(index, MultiIndex): -> 4049 loc, new_index = index._get_loc_level(key, level=0) 4050 if not drop_level: 4051 if lib.is_integer(loc): /usr/local/lib/python3.10/dist-packages/pandas/core/indexes/multi.py in _get_loc_level(self, key, level) 3158 return indexer, maybe_mi_droplevels(indexer, ilevels) 3159 else: -> 3160 indexer = self._get_level_indexer(key, level=level) 3161 if ( 3162 isinstance(key, str) /usr/local/lib/python3.10/dist-packages/pandas/core/indexes/multi.py in _get_level_indexer(self, key, level, indexer) 3288 if start == end: 3289 # The label is present in self.levels[level] but unused: -> 3290 raise KeyError(key) 3291 return slice(start, end) 3292 KeyError: 'True Negative'
In [ ]:
cat_variables = ['Gender','Uncomplicated Hypertension', 'Complicated Hypertension', 'Uncomplicated Diabetes',
'Complicated Diabetes', 'Malignancy', 'Hematologic Disease',
'Metastasis', 'Peripheral Vascular Disease', 'Hypothyroidism',
'Chronic Heart Failure', 'Stroke', 'Liver Disease','Sepsis', 'Any Organ Failure',
'Severe Respiratory Failure',
'Severe Coagulation Failure', 'Severe Liver Failure',
'Severe Cardiovascular Failure','Severe Central Nervous System Failure', 'Severe Renal Failure',
'Respiratory Dysfunction', 'Cardiovascular Dysfunction',
'Renal Dysfunction', 'Hematologic Dysfunction', 'Metabolic Dysfunction', 'Neurologic Dysfunction']
df_concat = pd.concat([df_true_values, df_false_values], keys=['True Prediction', 'False Negative'], names=['Group'])
for category in cat_variables:
fig, ax = plt.subplots(1, 2, sharey=True)
#set the figure size
fig.set_figwidth(15)
sns.countplot(data = df_concat.loc['True Prediction'], x = category, hue = category, ax=ax[0])
ax[0].set_xlabel('True Prediction')
sns.countplot(data = df_concat.loc['False Negative'], x = category, hue = category, ax=ax[1])
ax[1].set_xlabel('False Negative')
# Calculate percentages
true_pos_count_0 = df_concat[df_concat[category] == 0].loc['True Prediction'].shape[0]
true_pos_count_1 = df_concat[df_concat[category] == 1].loc['True Prediction'].shape[0]
true_pos_total = df_concat.loc['True Prediction'].shape[0]
true_pos_percentage = (abs(true_pos_count_0 - true_pos_count_1) / true_pos_total) * 100
flase_pos_count_0 = df_concat[df_concat[category] == 0].loc['False Negative'].shape[0]
flase_pos_count_1 = df_concat[df_concat[category] == 1].loc['False Negative'].shape[0]
flase_pos_total = df_concat.loc['False Negative'].shape[0]
flase_pos_percentage = (abs(flase_pos_count_0 - flase_pos_count_1) / flase_pos_total) * 100
# Annotate the plot with the percentage difference
ax[0].text(0.5, 0.5, f'Difference: {true_pos_percentage:.2f}%', horizontalalignment='center', verticalalignment='center', transform=ax[0].transAxes, fontsize=12, color='red')
ax[1].text(0.5, 0.5, f'Difference: {flase_pos_percentage:.2f}%', horizontalalignment='center', verticalalignment='center', transform=ax[1].transAxes, fontsize=12, color='red')
plt.suptitle(category)
plt.tight_layout()
<ipython-input-88-345a9b7a41d2>:13: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). Consider using `matplotlib.pyplot.close()`. fig, ax = plt.subplots(1, 2, sharey=True)
Bar Plot Categorical Variables¶
In [ ]:
cat_variables = ['Gender', 'Uncomplicated Hypertension', 'Complicated Hypertension', 'Uncomplicated Diabetes',
'Complicated Diabetes', 'Malignancy', 'Hematologic Disease',
'Metastasis', 'Peripheral Vascular Disease', 'Hypothyroidism',
'Chronic Heart Failure', 'Stroke', 'Liver Disease', 'Sepsis', 'Any Organ Failure',
'Severe Respiratory Failure',
'Severe Coagulation Failure', 'Severe Liver Failure',
'Severe Cardiovascular Failure', 'Severe Central Nervous System Failure', 'Severe Renal Failure',
'Respiratory Dysfunction', 'Cardiovascular Dysfunction',
'Renal Dysfunction', 'Hematologic Dysfunction', 'Metabolic Dysfunction', 'Neurologic Dysfunction']
# Concatenate all dataframes
df_concat = pd.concat([df_true_pos, df_false_pos, df_true_neg, df_false_neg, df_true_values, df_false_values],
keys=['True Positive', 'False Positive', 'True Negative', 'False Negative', 'True Values', 'False Values'],
names=['Group'])
fig, axes = plt.subplots(len(cat_variables), 6, figsize=(26, 6 * len(cat_variables)))
fig.suptitle('Comparison of Categorical Variables', fontsize=38, fontweight='bold')
for i, category in enumerate(cat_variables):
axes[i, 2].text(0.5, 1.1, category, ha='center', va='bottom', transform=axes[i, 2].transAxes, fontsize=18, fontweight='bold')
# True Positive
sns.countplot(data=df_concat.loc['True Positive'], x=category, hue=category, ax=axes[i, 0])
axes[i, 0].set_title('True Positive')
axes[i, 0].set_xlabel('')
axes[i, 0].set_ylabel('')
axes[i, 0].legend().remove()
# False Positive
sns.countplot(data=df_concat.loc['False Positive'], x=category, hue=category, ax=axes[i, 1])
axes[i, 1].set_title('False Positive')
axes[i, 1].set_xlabel('')
axes[i, 1].set_ylabel('')
axes[i, 1].legend().remove()
# True Negative
sns.countplot(data=df_concat.loc['True Negative'], x=category, hue=category, ax=axes[i, 2])
axes[i, 2].set_title('True Negative')
axes[i, 2].set_xlabel('')
axes[i, 2].set_ylabel('')
axes[i, 2].legend().remove()
# False Negative
sns.countplot(data=df_concat.loc['False Negative'], x=category, hue=category, ax=axes[i, 3])
axes[i, 3].set_title('False Negative')
axes[i, 3].set_xlabel('')
axes[i, 3].set_ylabel('')
axes[i, 3].legend().remove()
# True Values
sns.countplot(data=df_concat.loc['True Values'], x=category, hue=category, ax=axes[i, 4])
axes[i, 4].set_title('True Values')
axes[i, 4].set_xlabel('')
axes[i, 4].set_ylabel('')
axes[i, 4].legend().remove()
# False Values
sns.countplot(data=df_concat.loc['False Values'], x=category, hue=category, ax=axes[i, 5])
axes[i, 5].set_title('False Values')
axes[i, 5].set_xlabel('')
axes[i, 5].set_ylabel('')
axes[i, 5].legend().remove()
# Calculate percentage difference for each group
for j, group in enumerate(['True Positive', 'False Positive', 'True Negative', 'False Negative', 'True Values', 'False Values']):
try:
count_0 = df_concat[df_concat[category] == 0].loc[group].shape[0]
count_1 = df_concat[df_concat[category] == 1].loc[group].shape[0]
total = df_concat.loc[group].shape[0]
percentage = (abs(count_0 - count_1) / total) * 100
except KeyError:
percentage = 100 # Assign a default value if KeyError occurs
axes[i, j].text(0.5, 0.5, f'Difference: {percentage:.2f}%', horizontalalignment='center', verticalalignment='center', transform=axes[i, j].transAxes, fontsize=12, color='black', fontweight='bold')
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()